# load data (read_csv comes from readr, attached upstream)
data <- read_csv('Speed Dating Data.csv')
# we save the data as separate people worked on the coding side with different variable names
SpeedDatingData <- data
# select the relevant columns for our LBS survey
# (the original listed `career` twice; dplyr::select de-duplicates it anyway,
# so removing the repeat does not change the result)
data2 <- data %>%
  select(age,
         age_o,
         gender,
         from,
         imprace,
         imprelig,
         goal,
         date,
         go_out,
         career,           # intended career (free text)
         int_corr,
         career_c,         # coded career category
         sports:yoga,      # hobby interest scores
         attr1_1:shar1_1,  # what YOU look for in a date
         attr4_1:shar4_1,  # what you think fellow men/women look for
         attr2_1:shar2_1,  # what you think the opposite sex looks for
         attr3_1:amb3_1,   # how you rate yourself
         attr5_1:amb5_1)   # how you think others rate you
# Thao and Sijue kindly ran through some visualisations and cleaning of the
# speed-dating dataset
# Average attribute-importance scores (wave-1 survey, before the date) by
# gender, restricted to respondents with a complete set of ratings
SpeedDatingData.AvgAttributes <- SpeedDatingData %>%
  filter(!is.na(attr1_1),
         !is.na(sinc1_1),
         !is.na(intel1_1),
         !is.na(fun1_1),
         !is.na(amb1_1),
         !is.na(shar1_1)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr1_1), 0),
            AvgSincere1    = round(mean(sinc1_1), 0),
            AvgIntel1      = round(mean(intel1_1), 0),
            AvgFun1        = round(mean(fun1_1), 0),
            AvgAmb1        = round(mean(amb1_1), 0),
            AvgShar1       = round(mean(shar1_1), 0))
# Reshape a one-row-per-gender table of average attribute scores into long
# format: one row per (gender, attribute) with columns gender / score /
# TypeOfAttribute. Row order matches the original rbind() stacking
# (Attractive, Sincere, Fun, Ambition, SharedInterests, Intelligence).
#
# @param SpeedDatingData.AvgAttributes data frame with a `gender` column plus
#   AvgAttractive1 / AvgSincere1 / AvgFun1 / AvgAmb1 / AvgShar1 / AvgIntel1
# @return data frame with columns gender, score, TypeOfAttribute (factor)
DataAnalysisSpeedDating <- function(SpeedDatingData.AvgAttributes)
{
  # Mapping from input column to display label used on the plots.
  # BUG FIX: the original label "Ambiton" was a typo; corrected to "Ambition".
  attribute_labels <- c(AvgAttractive1 = "Attractive",
                        AvgSincere1    = "Sincere",
                        AvgFun1        = "Fun",
                        AvgAmb1       = "Ambition",
                        AvgShar1       = "SharedInterests",
                        AvgIntel1      = "Intelligence")
  # Build one small gender/score/TypeOfAttribute frame per attribute and
  # stack them — replacing six hand-copied select/mutate chains
  pieces <- lapply(names(attribute_labels), function(col) {
    data.frame(gender = SpeedDatingData.AvgAttributes$gender,
               score = SpeedDatingData.AvgAttributes[[col]],
               TypeOfAttribute = attribute_labels[[col]],
               stringsAsFactors = FALSE)
  })
  SpeedDatingData.Summary1 <- do.call(rbind, pieces)
  SpeedDatingData.Summary1$TypeOfAttribute <-
    as.factor(SpeedDatingData.Summary1$TypeOfAttribute)
  return(SpeedDatingData.Summary1)
}
library(gridExtra)
#Before the date
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes)
# Males (gender == 1): horizontal bars ordered by score, value printed inside
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before1_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
# Females (gender == 0): same chart, different fill
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before1_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before1_men, before1_women, ncol = 2)
# Attractiveness is the most important to men but only of medium importance
# for women. Intelligence is very crucial for both genders, first for female
# and second for male. Shared interest and Ambition are the two least popular
# attributes that participants are looking for in partners.
#After the date
# Same attribute-importance summary, using the day-after survey (attr1_2 set)
SpeedDatingData.AvgAttributes5 <- SpeedDatingData %>%
  filter(!is.na(attr1_2),
         !is.na(sinc1_2),
         !is.na(intel1_2),
         !is.na(fun1_2),
         !is.na(amb1_2),
         !is.na(shar1_2)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr1_2), 0),
            AvgSincere1    = round(mean(sinc1_2), 0),
            AvgIntel1      = round(mean(intel1_2), 0),
            AvgFun1        = round(mean(fun1_2), 0),
            AvgAmb1        = round(mean(amb1_2), 0),
            AvgShar1       = round(mean(shar1_2), 0))
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes5)
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before2_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before2_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before2_men, before2_women, ncol = 2)
# Male: Attractiveness is the only top-3 attribute showing an increase,
# reinforcing its dominant position, leaving the closest attribute 12 points
# apart. Fun overtook Intelligence to become the second most important
# attribute.
# Female: Attractiveness rose by 4 points, from third to first position.
# Intelligence and Sincere fell to the following positions.
#3-4 weeks later
# Same summary again, using the 3-4 week follow-up survey (attr1_3 set)
SpeedDatingData.AvgAttributes7 <- SpeedDatingData %>%
  filter(!is.na(attr1_3),
         !is.na(sinc1_3),
         !is.na(intel1_3),
         !is.na(fun1_3),
         !is.na(amb1_3),
         !is.na(shar1_3)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr1_3), 0),
            AvgSincere1    = round(mean(sinc1_3), 0),
            AvgIntel1      = round(mean(intel1_3), 0),
            AvgFun1        = round(mean(fun1_3), 0),
            AvgAmb1        = round(mean(amb1_3), 0),
            AvgShar1       = round(mean(shar1_3), 0))
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes7)
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before3_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before3_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before3_men, before3_women, ncol = 2)
# Male: Intelligence overtook Fun to stand in second position. Perhaps, after
# 4-6 weeks, they talked to each other more and figured out that Intelligence
# is more important.
# Female: the gap between Attractive and Intelligence is now smaller (3 to 1
# point only).
#Before the date
# What participants think the OPPOSITE SEX finds important (attr2_* set),
# measured before the date
SpeedDatingData.AvgAttributes9 <- SpeedDatingData %>%
  filter(!is.na(attr2_1),
         !is.na(sinc2_1),
         !is.na(intel2_1),
         !is.na(fun2_1),
         !is.na(amb2_1),
         !is.na(shar2_1)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr2_1), 0),
            AvgSincere1    = round(mean(sinc2_1), 0),
            AvgIntel1      = round(mean(intel2_1), 0),
            AvgFun1        = round(mean(fun2_1), 0),
            AvgAmb1        = round(mean(amb2_1), 0),
            AvgShar1       = round(mean(shar2_1), 0))
# BUG FIX: the original passed SpeedDatingData.AvgAttributes7 here, so the
# attr2_1 summary computed just above was never actually plotted
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes9)
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before4_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before4_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before4_men, before4_women, ncol = 2)
#After the date
# What participants think the opposite sex finds important, re-asked the day
# after the event (attr2_2 set)
SpeedDatingData.AvgAttributes11 <- SpeedDatingData %>%
  filter(!is.na(attr2_2),
         !is.na(sinc2_2),
         !is.na(intel2_2),
         !is.na(fun2_2),
         !is.na(amb2_2),
         !is.na(shar2_2)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr2_2), 0),
            AvgSincere1    = round(mean(sinc2_2), 0),
            AvgIntel1      = round(mean(intel2_2), 0),
            AvgFun1        = round(mean(fun2_2), 0),
            AvgAmb1        = round(mean(amb2_2), 0),
            AvgShar1       = round(mean(shar2_2), 0))
# BUG FIX: the original plotted SpeedDatingData.AvgAttributes7 here instead
# of the attr2_2 summary computed just above
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes11)
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before5_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before5_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before5_men, before5_women, ncol = 2)
#3-4 weeks later
# What participants think the opposite sex finds important, 3-4 weeks after
# the event (attr2_3 set).
# BUG FIXES: the original (a) filtered on attr2_2 instead of attr2_3,
# (b) overwrote SpeedDatingData.AvgAttributes11 from the previous section,
# and (c) plotted SpeedDatingData.AvgAttributes7 anyway. A fresh name and
# the *_3 columns are used throughout.
SpeedDatingData.AvgAttributes13 <- SpeedDatingData %>%
  filter(!is.na(attr2_3),
         !is.na(sinc2_3),
         !is.na(intel2_3),
         !is.na(fun2_3),
         !is.na(amb2_3),
         !is.na(shar2_3)) %>%
  group_by(gender) %>%
  summarise(AvgAttractive1 = round(mean(attr2_3), 0),
            AvgSincere1    = round(mean(sinc2_3), 0),
            AvgIntel1      = round(mean(intel2_3), 0),
            AvgFun1        = round(mean(fun2_3), 0),
            AvgAmb1        = round(mean(amb2_3), 0),
            AvgShar1       = round(mean(shar2_3), 0))
SpeedDatingData.Summary1 <- DataAnalysisSpeedDating(SpeedDatingData.AvgAttributes13)
SpeedDatingData.Summary1.Male <- SpeedDatingData.Summary1 %>% filter(gender == 1)
before6_men <- ggplot(SpeedDatingData.Summary1.Male,
                      aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "cadetblue3") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Males') +
  coord_flip() +
  theme_bw()
SpeedDatingData.Summary1.Female <- SpeedDatingData.Summary1 %>% filter(gender == 0)
before6_women <- ggplot(SpeedDatingData.Summary1.Female,
                        aes(x = reorder(TypeOfAttribute, score), y = score)) +
  geom_bar(stat = 'identity', colour = "white", fill = "hotpink1") +
  geom_text(aes(x = TypeOfAttribute, y = 1, label = paste0("(", score, ")")),
            hjust = 0, vjust = .5, size = 4, colour = 'white',
            fontface = 'bold') +
  labs(x = 'TypeOfAttribute', y = 'Score',
       title = 'TypeOfAttribute and Score - Female') +
  coord_flip() +
  theme_bw()
# plot side by side
grid.arrange(before6_men, before6_women, ncol = 2)
# (console output below: per-column NA counts for data2)
#    age  age_o gender   from imprace imprelig  goal  date
95 104 0 79 79 79 79 97
go_out career int_corr career_c sports tvsports exercise dining
79 89 158 138 79 79 79 79
museums art hiking gaming clubbing reading tv theater
79 79 79 79 79 79 79 79
movies concerts music shopping yoga attr1_1 sinc1_1 intel1_1
79 79 79 79 79 79 79 79
fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1
89 99 121 8378 7997 8204 8319 7693
shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1
8059 79 79 79 79 89 89 105
sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1
105 105 105 105 8378 8368 8378 8378
amb5_1
8363
# too many na's in attr4-shar4 and attr5-amb5, let's drop columns with >30% na
# BUG FIX: the original used data2[, -which(colMeans(is.na(data2)) > 0.3)],
# which silently drops EVERY column when no column exceeds the threshold
# (negative indexing with integer(0) selects nothing). Logical indexing is
# safe in both cases and keeps exactly the same columns otherwise.
data3 <- data2[, colMeans(is.na(data2)) <= 0.3]
# drop the character columns for correlation
data_cor <- data %>%
  select(-career, -field, -from)
# there are lots of columns with over half na, so drop columns with >30% na
# (same logical-indexing fix as above)
data_cor <- data_cor[, colMeans(is.na(data_cor)) <= 0.3]
# Find the correlations and round to 2 dp
cor_mat <- round(cor(data_cor, use = 'complete.obs', method = 'spearman'), 2)
head(cor_mat)
iid 1.00 0.06 0.08 0.06 0.23 1.00 0.30 0.10 0.06 0.06
id 0.06 1.00 0.04 1.00 0.30 0.02 0.38 0.16 0.16 0.15
gender 0.08 0.04 1.00 0.08 0.00 0.01 0.07 0.04 0.00 0.00
idg 0.06 1.00 0.08 1.00 0.31 0.02 0.39 0.17 0.16 0.16
condtn 0.23 0.30 0.00 0.31 1.00 0.23 0.63 0.33 0.31 0.31
wave 1.00 0.02 0.01 0.02 0.23 1.00 0.30 0.09 0.06 0.06
pid match int_corr samerace age_o race_o pf_o_att pf_o_sin
iid 0.99 0.00 0.06 -0.04 0.12 0.12 0.04 0.00
id 0.02 0.00 -0.03 0.02 -0.01 -0.04 0.00 0.00
gender -0.05 -0.02 0.01 -0.01 -0.14 0.06 -0.45 0.12
idg 0.02 0.00 -0.03 0.02 -0.01 -0.03 -0.02 0.00
condtn 0.23 -0.05 0.04 0.06 0.15 -0.07 0.01 0.06
wave 1.00 0.01 0.07 -0.04 0.13 0.12 0.08 -0.01
pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o
iid -0.08 0.04 -0.08 0.02 0.01 -0.03 -0.02 -0.01
id 0.02 0.00 -0.03 -0.01 0.00 0.03 -0.03 -0.04
gender 0.04 -0.03 0.37 0.09 -0.10 -0.10 -0.04 0.05
idg 0.03 0.00 -0.01 0.00 -0.01 0.03 -0.03 -0.03
condtn 0.06 0.00 -0.09 -0.04 -0.04 -0.03 -0.05 -0.03
wave -0.08 0.04 -0.11 0.01 0.02 -0.03 -0.02 -0.01
fun_o amb_o shar_o like_o prob_o met_o age field_cd race imprace
iid 0.03 -0.01 -0.01 -0.03 0.03 0.06 0.08 0.08 0.12 -0.04
id 0.02 -0.01 0.00 0.00 -0.02 0.05 0.09 -0.02 -0.09 0.02
gender -0.05 0.10 -0.03 -0.05 -0.01 0.00 0.12 -0.09 -0.07 -0.13
idg 0.01 -0.01 -0.01 0.00 -0.02 0.05 0.09 -0.02 -0.10 0.02
condtn -0.04 -0.05 -0.03 -0.04 -0.06 0.07 0.15 0.07 -0.08 0.03
wave 0.03 -0.01 -0.01 -0.03 0.03 0.07 0.06 0.09 0.13 -0.03
imprelig zipcode goal date go_out career_c sports tvsports
iid -0.10 0.06 -0.01 0.06 0.03 0.10 0.01 0.04
id 0.04 0.00 -0.09 0.01 -0.06 0.14 0.09 0.09
gender -0.23 -0.15 0.03 -0.14 -0.03 -0.03 0.26 0.14
idg 0.03 0.00 -0.09 0.00 -0.06 0.13 0.10 0.10
condtn 0.10 -0.02 -0.03 0.06 0.02 0.00 0.05 0.03
wave -0.08 0.08 0.00 0.07 0.04 0.10 -0.01 0.03
exercise dining museums art hiking gaming clubbing reading tv
iid 0.00 0.12 0.08 0.12 0.00 0.00 0.00 -0.02 0.00
id 0.02 0.04 -0.05 0.00 0.03 0.04 -0.03 0.03 0.03
gender -0.10 -0.21 -0.28 -0.29 -0.09 0.24 -0.09 -0.25 -0.19
idg 0.02 0.03 -0.06 -0.01 0.03 0.05 -0.03 0.01 0.02
condtn -0.01 -0.04 0.02 0.04 -0.04 0.00 -0.08 0.05 0.03
wave 0.00 0.12 0.10 0.14 0.00 -0.02 0.01 -0.01 0.01
theater movies concerts music shopping yoga exphappy attr1_1
iid 0.03 0.03 0.13 0.15 0.12 0.03 0.11 0.12
id 0.04 -0.02 0.01 0.04 -0.02 -0.09 -0.06 0.12
gender -0.40 -0.18 -0.17 -0.08 -0.37 -0.21 0.21 0.48
idg 0.02 -0.03 0.00 0.04 -0.04 -0.10 -0.05 0.14
condtn -0.01 0.04 -0.01 0.00 0.02 -0.06 0.06 0.01
wave 0.06 0.04 0.14 0.15 0.15 0.04 0.10 0.09
sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr2_1 sinc2_1 intel2_1
iid -0.02 -0.14 0.06 -0.12 -0.01 0.13 -0.09 -0.15
id -0.06 -0.05 0.01 -0.12 -0.08 0.00 0.00 -0.07
gender -0.13 -0.02 0.01 -0.43 -0.08 -0.39 0.34 0.30
idg -0.07 -0.05 0.00 -0.13 -0.09 -0.02 0.02 -0.05
condtn 0.07 0.03 -0.01 -0.06 -0.02 0.06 -0.03 -0.03
wave -0.01 -0.14 0.06 -0.09 0.00 0.16 -0.12 -0.17
fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 dec
iid -0.02 0.01 -0.03 0.06 0.08 0.08 0.02 0.06 0.00
id 0.02 0.00 -0.02 -0.02 0.09 0.08 0.02 0.05 -0.02
gender -0.04 0.35 -0.15 -0.08 -0.18 -0.15 0.07 -0.03 0.10
idg 0.01 0.01 -0.03 -0.02 0.09 0.07 0.03 0.05 -0.01
condtn -0.07 -0.04 -0.02 -0.01 0.04 0.02 0.04 0.00 -0.04
wave -0.03 -0.02 -0.02 0.06 0.08 0.08 0.01 0.05 0.00
attr sinc intel fun amb shar like prob met match_es
iid -0.03 -0.01 -0.02 0.03 -0.01 -0.03 -0.06 0.00 -0.54 0.11
id -0.05 -0.02 -0.02 -0.04 -0.05 -0.03 -0.04 -0.02 -0.06 -0.03
gender 0.08 -0.03 -0.13 -0.01 -0.17 -0.03 0.01 -0.01 -0.04 0.12
idg -0.05 -0.02 -0.03 -0.04 -0.05 -0.03 -0.04 -0.02 -0.06 -0.02
condtn -0.01 -0.02 0.01 -0.03 -0.02 -0.04 -0.02 -0.04 -0.23 0.15
wave -0.04 -0.01 -0.01 0.02 -0.01 -0.03 -0.06 0.00 -0.55 0.11
satis_2 length numdat_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2
iid 0.07 -0.05 0.02 0.02 0.01 0.04 -0.03 -0.05
id 0.00 -0.01 -0.04 0.10 -0.17 0.00 -0.04 -0.11
gender 0.19 -0.10 -0.04 0.40 -0.11 -0.17 0.00 -0.31
idg 0.01 -0.02 -0.04 0.12 -0.17 -0.01 -0.03 -0.12
condtn 0.16 -0.10 0.10 0.08 0.01 -0.01 0.05 -0.09
wave 0.06 -0.05 0.02 -0.01 0.02 0.05 -0.03 -0.03
shar1_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2
iid -0.02 0.03 0.02 0.00 0.11 0.05
id -0.04 -0.06 0.05 0.05 0.03 0.07
gender -0.21 -0.05 -0.18 0.09 -0.15 -0.01
idg -0.05 -0.06 0.05 0.05 0.02 0.07
condtn -0.03 -0.01 0.00 0.02 0.01 0.02
wave 0.00 0.03 0.03 -0.01 0.12 0.05
# prepare the correlation matrix in long format for ggplot
melted_cor_mat <- melt(cor_mat)
# keep only correlations with `match`, and only those of meaningful size
# (|value| >= 0.1 is equivalent to value >= 0.1 | value <= -0.1)
melted_match <- melted_cor_mat %>%
  filter(X1 == 'match', abs(value) >= 0.1)
# plot
ggplot(data = melted_match, aes(x = X1, y = X2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = 'blue', high = 'red', mid = 'white') +
  labs(title = 'Correlation Heatmap for Match',
       x = 'Variable - Match',
       y = 'Variables with a correlation > 0.1')
# The heatmap shows that all the high correlations (above 0.3) are columns
# which participants complete after a speed date. This is an issue for when
# we try to model and predict for LBS students, as we only have the pre-date
# self-evaluation data available.
Let’s try to show the data in a better format - we’re going to build a radar chart
#First, data is checked for consistency since some of the participants will place the ranks differently than others (on a 1-10 scale compared to using a distribution of 100 points).
# take the related attributes with iid and gender into a new data frame;
# unique() collapses the one-row-per-date raw data to one row per person
data_radar <- data %>%
  group_by(gender) %>%
  select(iid, gender,
         attr1_1, sinc1_1, intel1_1,
         fun1_1, amb1_1, shar1_1) %>%
  unique()
# Next, we would like to turn all NA into 0 — but first check that no entries
# in iid or gender are NA, to prevent mislabels.
# FIX: the original comment promised an iid OR gender check but only ran the
# iid one; both are checked now (each printed 0 on the original data).
sum(is.na(data_radar$iid))
sum(is.na(data_radar$gender))
# Apply command to change all NA to 0
data_radar[is.na(data_radar)] <- 0
# Add column to check if total of attributes adds up to 100
data_radar$total <- rowSums(data_radar[, c("attr1_1",
                                           "sinc1_1",
                                           "intel1_1",
                                           "fun1_1",
                                           "amb1_1",
                                           "shar1_1")])
# table(data_radar$total)
# A total of 0 means all entries were missing, so the row is dropped.
# FIX: compare numerically (the original compared total against the string
# "0", which only worked through implicit coercion)
data_radar <- data_radar %>%
  filter(total != 0)
# As there are entry errors in the data, all points are redistributed and
# curved so every respondent's attribute points total exactly 100
attr_cols <- c("attr1_1", "sinc1_1", "intel1_1", "fun1_1", "amb1_1", "shar1_1")
for (col in attr_cols) {
  data_radar[[col]] <- round(data_radar[[col]] / data_radar$total * 100,
                             digits = 2)
}
# recompute the total as a sanity check — every row should now be (about) 100
data_radar$total <- rowSums(data_radar[, attr_cols])
data_radar$total <- round(data_radar$total, digits = 0)
table(data_radar$total)
# console output: all 544 remaining rows total 100
# 100
# 544
# visualise data in a radar chart (fmsb::radarchart expects the first two
# rows of its input to be the axis maxima and minima)
test1 <- data_radar %>%
  group_by(gender) %>%
  summarise(Attractive  = mean(attr1_1),
            Sincere     = mean(sinc1_1),
            Intelligent = mean(intel1_1),
            Fun         = mean(fun1_1),
            Ambitious   = mean(amb1_1),
            Interest    = mean(shar1_1))
test1forplot <- test1 %>%
  select(-gender)
# axis max (36) and min (0) rows, one column per attribute
maxmin <- data.frame(
  Attractive  = c(36, 0),
  Sincere     = c(36, 0),
  Intelligent = c(36, 0),
  Fun         = c(36, 0),
  Ambitious   = c(36, 0),
  Interest    = c(36, 0))
test11 <- rbind(maxmin, test1forplot)
# single-gender variants — not used by the chart below; presumably kept for
# ad-hoc plotting (NOTE(review): confirm they are needed elsewhere)
test11male <- test11[c(1, 2, 4), ]
test11female <- test11[c(1, 2, 3), ]
radarchart(test11,
           pty = 32,
           axistype = 0,
           pcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
           pfcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
           plty = 1,
           plwd = 3,
           cglty = 1,
           cglcol = "gray88",
           centerzero = TRUE,
           seg = 5,
           vlcex = 0.75,
           palcex = 0.75)
legend("topleft",
       c("Male", "Female"),
       fill = c(adjustcolor("cadetblue2", 0.5), adjustcolor("hotpink1", 0.5)))
# load LBS data (next section)
# load the LBS survey data
LBS_data <- read_csv('Finding your soulmate no names.csv')
# change the names of LBS_data so they are consistent with our previous
# dataset: pull the (already cleaned) column names from data3
col_names <- colnames(data3)
col_names
[7] "goal" "date" "go_out" "career" "int_corr" "career_c"
[13] "sports" "tvsports" "exercise" "dining" "museums" "art"
[19] "hiking" "gaming" "clubbing" "reading" "tv" "theater"
[25] "movies" "concerts" "music" "shopping" "yoga" "attr1_1"
[31] "sinc1_1" "intel1_1" "fun1_1" "amb1_1" "shar1_1" "attr2_1"
[37] "sinc2_1" "intel2_1" "fun2_1" "amb2_1" "shar2_1" "attr3_1"
[43] "sinc3_1" "fun3_1" "intel3_1" "amb3_1"
[1] "Numbers"
[2] "Age"
[3] "Gender"
[4] "Sexual orientation"
[5] "Where are you from originally? (Country)"
[6] "How important is it to you (on a scale of 1-10) that a person you date be of the same cultural/ethnic background?"
[7] "What is your primary goal in participating in this event?"
[8] "In general, how frequently do you go on dates?"
[9] "How often do you go out (not necessarily on dates)?"
[10] "What is your intended career?"
[11] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Sport]"
[12] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [TV sports]"
[13] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Exercise]"
[14] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Dining]"
[15] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Museums]"
[16] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Art]"
[17] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Hiking]"
[18] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Gaming]"
[19] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Clubbing]"
[20] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Reading]"
[21] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [TV]"
[22] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Theater]"
[23] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Movies]"
[24] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Concerts]"
[25] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Music]"
[26] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Shopping]"
[27] "How interested are you in the following activities, on a scale of 1-10? (1 = no interest, 10 = very interested) [Yoga]"
[28] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Attractive]"
[29] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Sincere]"
[30] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Intelligent]"
[31] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Fun]"
[32] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Ambitious]"
[33] "What attributes matter more for YOU in a potential date? Rank them from most important (1) to least important (6) [Shared Interests]"
[34] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Attractive]"
[35] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Sincere]"
[36] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Intelligent]"
[37] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Fun]"
[38] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Ambitious]"
[39] "What attributes you think matter more for YOUR FELLOW MEN / WOMEN in a potential date? Rank them from most important (1) to least important (6). [Shared Interests]"
[40] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Attractive]"
[41] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Sincere]"
[42] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Intelligent]"
[43] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Fun]"
[44] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Ambitious]"
[45] "What attributes you think matter more to YOUR POTENTIAL PARTNER in a potential date? Rank them from most important (1) to least important (6). [Shared Interests]"
[46] "How do you think you measure up? Please rate your opinion of your own attributes, on a scale of 1 (low) - 10 (high): [Attractive]"
[47] "How do you think you measure up? Please rate your opinion of your own attributes, on a scale of 1 (low) - 10 (high): [Sincere]"
[48] "How do you think you measure up? Please rate your opinion of your own attributes, on a scale of 1 (low) - 10 (high): [Intelligent]"
[49] "How do you think you measure up? Please rate your opinion of your own attributes, on a scale of 1 (low) - 10 (high): [Fun]"
[50] "How do you think you measure up? Please rate your opinion of your own attributes, on a scale of 1 (low) - 10 (high): [Ambitious]"
[51] "And finally, how do you think others perceive you? Please rate yourself how you think others would rate you on each of the following attributes, on a scale of 1 (low) - 10 (high) [Attractive]"
[52] "And finally, how do you think others perceive you? Please rate yourself how you think others would rate you on each of the following attributes, on a scale of 1 (low) - 10 (high) [Sincere]"
[53] "And finally, how do you think others perceive you? Please rate yourself how you think others would rate you on each of the following attributes, on a scale of 1 (low) - 10 (high) [Intelligent]"
[54] "And finally, how do you think others perceive you? Please rate yourself how you think others would rate you on each of the following attributes, on a scale of 1 (low) - 10 (high) [Fun]"
[55] "And finally, how do you think others perceive you? Please rate yourself how you think others would rate you on each of the following attributes, on a scale of 1 (low) - 10 (high) [Ambitious]"
[56] "Would you like to share your code with matched partner?"
# Build the replacement column-name vector for LBS_data in one pass, so its
# columns line up with the speed-dating naming scheme
new_col_names <- c(
  # basic demographics / dating habits
  'iid',
  'age',
  'gender',
  'orientation',
  'from',
  'imprace',
  'goal',
  'date',
  'go_out',
  'career',
  # hobbies and attr1_1:shar1_1 (important attributes in a date partner)
  col_names[13:35],
  # attr4_1:shar4_1 (what you think fellow men/women find important)
  'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1',
  # attr2_1:shar2_1 (what you think the opposite sex finds important)
  'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',
  # attr3_1:amb3_1 (how you think you measure up)
  'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1',
  # attr5_1:amb5_1 (how you think others perceive you)
  'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1',
  # share code y/n
  'share_code')
# Change the column names of LBS_data
colnames(LBS_data) <- new_col_names
# Clean the `from` column: collapse free-text country variants to one label;
# anything not listed falls through unchanged
LBS_data <- LBS_data %>%
  mutate(from = case_when(
    from %in% c('China',
                'China/Finland',
                "China/Portugal depending on what you mean") ~ 'China',
    from %in% c('Italia', 'Italy') ~ 'Italy',
    from == 'France / Switzerland' ~ 'France',
    from == 'Peru/Ukraine' ~ 'Peru',
    from %in% c('United Kingdom', 'UK') ~ 'UK',
    TRUE ~ from
  ))
# mutate for a general European/Asian comparison; countries outside both
# lists fall through to 'Other'
LBS_data <- LBS_data %>%
  mutate(from_cont = case_when(
    from %in% c('China', 'Vietnam', 'India', 'Singapore',
                'Macau', 'Pakistan', 'Malaysia') ~ 'Asia',
    from %in% c('Italy', 'France', 'Germany', 'Sweden', 'Switzerland',
                'UK', 'Russia', 'Poland', 'Slovakia') ~ 'Europe',
    # from %in% c('USA', 'Canada') ~ 'North America',
    # from %in% c('Peru', 'Argentina') ~ 'South America',
    TRUE ~ 'Other'
  ))
# Rechange the attributes rating so the results are more noticeable for radar plots
# The online survey recorded a rank from 1 (most important) to 6 (least
# important); the case_when below converts ranks to a sharply decaying point
# scale (1 -> 12, 2 -> 6, 3 -> 2, 4 -> 1, 5/6 -> 0) so radar charts show
# visible contrast between groups.
# NOTE(review): any rating outside 1..6 (including NA) becomes NA here —
# confirm that is intended.
# gather
tidy <- LBS_data %>%
gather(key='attribute',
value = 'rating',
attr1_1:shar2_1)
# mutate
tidy <- tidy %>%
mutate(rating = case_when(
rating == 1 ~ 12,
rating == 2 ~ 6,
rating == 3 ~ 2,
rating == 4 ~ 1,
rating == 5 ~ 0,
rating == 6 ~ 0
))
# spread: back to wide format, one column per attribute
LBS_clean <- tidy %>%
spread(key='attribute',
value ='rating')The reason we mutated all the scored was because the format of the online survey (ranking from 1 to 6) meant that the small fluctuations in mean ratings were smoothed out by the data preparation method for the radar charts. The above method artificially spikes the data so the fluctuations are more noticeable. This needs to be a point of care for future analysis.
# Count how many respondents of each gender chose each career.
careers <- LBS_clean %>%
  group_by(career, gender) %>%
  count(career) %>%
  arrange(desc(career))
# Dodged horizontal bar chart, careers ordered by count.
ggplot(careers,
       aes(x = reorder(career, n), y = n, fill = factor(gender))) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_fill_discrete(name = 'Gender') +
  labs(title = 'Careers Split by Gender',
       x = 'Career',
       y = 'Number of People') +
  coord_flip()# see interests split by gender
# Long format: one row per (respondent, interest) with its score.
interest <- LBS_clean %>%
  gather(key = 'intr', value = 'score', sports:yoga)
# Mean score per interest within each gender.
interest_summary <- interest %>%
  select(intr, score, gender) %>%
  group_by(gender, intr) %>%
  summarise(avg = mean(score))
# Horizontal bars stacked by gender, interests ordered by average score.
ggplot(interest_summary,
       aes(x = reorder(intr, avg), y = avg, fill = factor(gender))) +
  geom_bar(stat = 'identity') +
  scale_fill_discrete(name = 'Gender') +
  labs(title = 'Interest Split by Gender',
       x = 'Interest',
       y = 'Mean Score') +
  coord_flip()# Curve the attributes of this set of attributes
# Curve the six partner-preference scores so each respondent's points sum
# to 100 (the speed-dating dataset allocated 100 points across attributes,
# so this makes the two datasets comparable).
pref_cols <- c("attr1_1", "sinc1_1", "intel1_1", "fun1_1", "amb1_1", "shar1_1")
LBS_clean$total <- rowSums(LBS_clean[, pref_cols])
# Redistribute each column as a percentage of the respondent's row total.
for (col in pref_cols) {
  LBS_clean[[col]] <- round(LBS_clean[[col]] / LBS_clean$total * 100, digits = 2)
}
# create a smaller dataframe, grouped by gender for the radar chart
# group_by(gender) sorts the groups, so row 1 = first gender level
# (alphabetically) and row 2 = the second.
test2 <-LBS_clean %>%
group_by(gender) %>%
summarise(Attractive = mean(attr1_1),
Sincere = mean(sinc1_1),
Intelligent = mean(intel1_1),
Fun = mean(fun1_1),
Ambitious = mean(amb1_1),
Interest = mean(shar1_1))
# modify for plotting: radarchart() wants numeric columns only
test2forplot <-
test2 %>%
select(-gender)
# create the boundaries for the radar chart
# fmsb::radarchart() reads rows 1 and 2 of the frame as the per-axis
# max and min; 36 is the chosen axis ceiling for the curved scores.
maxmin <- data.frame(
Attractive = c(36, 0),
Sincere = c(36, 0),
Intelligent = c(36, 0),
Fun = c(36, 0),
Ambitious = c(36, 0),
Interest = c(36, 0))
# add it to the plot dataframe
test21 <- rbind(maxmin, test2forplot)
# separate the dataframe into two separate ones for plotting
# NOTE(review): test21male/test21female are built here but the call below
# plots test21 (both genders at once) — these subsets appear unused; confirm.
test21male <- test21[c(1,2,4),]
test21female <- test21[c(1,2,3),]
# plot the radar chart: both gender series overlaid, pink/blue fills
radarchart(test21,
pty = 32,
axistype = 0,
pcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
pfcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
plty = 1,
plwd = 3,
cglty = 1,
cglcol = "gray88",
centerzero = TRUE,
seg = 5,
vlcex = 0.75,
palcex = 0.75)
legend("topleft",
c("Male", "Female"),
fill = c(adjustcolor("cadetblue2", 0.5), adjustcolor("hotpink1", 0.5)))# create a smaller dataframe, grouped by gender for the radar chart
# Mean curved preference scores per country of origin; rows come out in
# alphabetical order of `from` because group_by sorts its groups.
test3 <-LBS_clean %>%
group_by(from) %>%
summarise(Attractive = mean(attr1_1),
Sincere = mean(sinc1_1),
Intelligent = mean(intel1_1),
Fun = mean(fun1_1),
Ambitious = mean(amb1_1),
Interest = mean(shar1_1))
# Find out which countries had the most responses
LBS_clean %>%
group_by(from) %>%
count() %>%
arrange(desc(n))# A tibble: 22 x 2
# Groups: from [22]
from n
<chr> <int>
1 Italy 14
2 France 13
3 China 11
4 India 7
5 Germany 6
6 Switzerland 5
7 Russia 3
8 USA 3
9 UK 2
10 America 1
# ... with 12 more rows
# modify for plotting and select the countries with most responses
test3forplot <-
test3 %>% filter(from %in% c('Italy',
'France',
'China',
'India')) %>%
select(-from)
# add it to the plot dataframe (rows 1-2 become the radar max/min bounds)
test_country <- rbind(maxmin, test3forplot)
# separate the dataframe into two separate ones for plotting
# NOTE: indexing is positional and relies on filter() preserving test3's
# alphabetical row order — China=3, France=4, India=5, Italy=6. If the
# country list above changes, these indices must change too.
test_china <- test_country[c(1,2,3),]
test_france <- test_country[c(1,2,4),]
test_india <- test_country[c(1,2,5),]
test_italy <- test_country[c(1,2,6),]
# plot the radar charts by country
# China
china_col <- adjustcolor("#e34a33", 0.5)
radarchart(test_china,
           pty = 32, axistype = 0,
           pcol = china_col, pfcol = china_col,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", "China", fill = china_col)# India
# India
india_col <- adjustcolor("#feb24c", 0.5)
radarchart(test_india,
           pty = 32, axistype = 0,
           pcol = india_col, pfcol = india_col,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", "India", fill = india_col)# France
# France
france_col <- adjustcolor("#3182bd", 0.5)
radarchart(test_france,
           pty = 32, axistype = 0,
           pcol = france_col, pfcol = france_col,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", "France", fill = france_col)# Italy
# Italy
italy_col <- adjustcolor("#31a354", 0.5)
radarchart(test_italy,
           pty = 32, axistype = 0,
           pcol = italy_col, pfcol = italy_col,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", "Italy", fill = italy_col)# France and Italy overlaid
# France and Italy overlaid (rows 4 and 6 of test_country)
fi_cols <- c(adjustcolor("#3182bd", 0.5),   # France
             adjustcolor("#31a354", 0.5))   # Italy
radarchart(test_country[c(1, 2, 4, 6), ],
           pty = 32, axistype = 0,
           pcol = fi_cols, pfcol = fi_cols,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", c("France","Italy"), fill = fi_cols)# China and India Overlaid
# China and India overlaid (rows 3 and 5 of test_country; rows 1-2 are the
# radar max/min boundary rows).
radarchart(test_country[c(1,2,3,5),],
pty = 32,
axistype = 0,
pcol = c(adjustcolor("#e34a33", 0.5),
adjustcolor("#feb24c", 0.5)),
pfcol = c(adjustcolor("#e34a33", 0.5),
adjustcolor("#feb24c", 0.5)),
plty = 1,
plwd = 3,
cglty = 1,
cglcol = "gray88",
centerzero = TRUE,
seg = 5,
vlcex = 0.75,
palcex = 0.75)
legend("topleft",
c("China","India"),
fill = c(adjustcolor("#e34a33", 0.5),
adjustcolor("#feb24c", 0.5)))Some interesting observations here. The Italian students want a partner who is attractive, intelligent and fun, whilst the French students do not care about fun at all but demand someone who is ambitious.
# split the divisions by gender and country
# create a smaller dataframe, grouped by gender for the radar chart
# Mean curved preference scores for every (gender, country) pair.
rad_ita <-LBS_clean %>%
group_by(gender, from) %>%
summarise(Attractive = mean(attr1_1),
Sincere = mean(sinc1_1),
Intelligent = mean(intel1_1),
Fun = mean(fun1_1),
Ambitious = mean(amb1_1),
Interest = mean(shar1_1))
# modify for plotting
# NOTE(review): rad_ita_forplot is only referenced by the commented-out
# plotting code further down; it is otherwise unused.
rad_ita_forplot <- rad_ita %>%
filter(from == 'Italy')
# Couldn't get the code to work, so I have just summarised the data in a table
print(rad_ita)# A tibble: 30 x 8
# Groups: gender [2]
gender from Attractive Sincere Intelligent Fun Ambitious Interest
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Female Argentina 0 0 4.76 28.6 9.52 57.1
2 Female China 20.0 5.71 11.4 16.7 24.8 21.4
3 Female France 15.0 15.6 10.2 2.72 39.5 17.0
4 Female Germany 31.0 22.6 10.7 17.9 1.19 16.7
5 Female India 28.6 4.76 0 0 9.52 57.1
6 Female Italy 7.93 24.6 25.4 31.0 5.56 5.56
7 Female Morocco 9.52 4.76 28.6 0 0 57.1
8 Female Russia 9.52 57.1 28.6 0 0 4.76
9 Female Singapore 0 57.1 0 4.76 9.52 28.6
10 Female Slovakia 4.76 0 9.52 57.1 0 28.6
# ... with 20 more rows
# write it to csv
# NOTE(review): the file name has no '.csv' extension — probably intended to
# be 'Table - country and gender split.csv'; confirm before changing, since
# downstream tooling may expect the current name.
write.csv(rad_ita, 'Table - country and gender split')
# The commented-out block below was the abandoned attempt at a per-country,
# per-gender radar chart. If re-enabled, note that `select(-gender -from)`
# is missing a comma and would fail; it should be `select(-gender, -from)`.
# rad_ita_forplot <- rad_ita_forplot %>%
# select(-gender -from)
# # add it to the plot dataframe
# rad_ita_forplot <- rbind(maxmin, rad_ita_forplot)
#
#
# # plot the radar chart
# radarchart(rad_ita_forplot,
# pty = 32,
# axistype = 0,
# pcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
# pfcol = c(adjustcolor("hotpink1", 0.5), adjustcolor("cadetblue2", 0.5)),
# plty = 1,
# plwd = 3,
# cglty = 1,
# cglcol = "gray88",
# centerzero = TRUE,
# seg = 5,
# vlcex = 0.75,
# palcex = 0.75)
#
# legend("topleft",
# c("Male", "Female"),
# fill = c(adjustcolor("cadetblue2", 0.5), adjustcolor("hotpink1", 0.5)))# create a smaller dataframe, grouped by gender for the radar chart
# Average the curved preference scores within each continent bucket.
test_cont <- LBS_clean %>%
  group_by(from_cont) %>%
  summarise(Attractive  = mean(attr1_1),
            Sincere     = mean(sinc1_1),
            Intelligent = mean(intel1_1),
            Fun         = mean(fun1_1),
            Ambitious   = mean(amb1_1),
            Interest    = mean(shar1_1))
# Drop the group label and prepend the radar max/min boundary rows.
# Groups come out alphabetically, so after the two boundary rows:
# row 3 = Asia, row 4 = Europe, row 5 = Other.
test_cont_forplot <- test_cont %>%
  select(-from_cont)
test_cont_forplot <- rbind(maxmin, test_cont_forplot)
# Europe (row 4 of test_cont_forplot; rows 1-2 are the max/min bounds)
europe_col <- adjustcolor("#003399", 0.7)
radarchart(test_cont_forplot[c(1, 2, 4), ],
           pty = 32, axistype = 0,
           pcol = europe_col, pfcol = europe_col,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", "Europe", fill = europe_col)# Just Asia
# Just Asia
# BUG FIX: the original indexed rows c(1,2,4), which re-plotted the Europe
# row under an "Asia" legend. Asia is row 3 of test_cont_forplot: rows 1-2
# are the max/min boundary rows and group rows are alphabetical
# (Asia = 3, Europe = 4, Other = 5).
radarchart(test_cont_forplot[c(1, 2, 3), ],
           pty = 32,
           axistype = 0,
           pcol = adjustcolor("#fd8d3c", 0.7),
           pfcol = adjustcolor("#fd8d3c", 0.7),
           plty = 1,
           plwd = 3,
           cglty = 1,
           cglcol = "gray88",
           centerzero = TRUE,
           seg = 5,
           vlcex = 0.75,
           palcex = 0.75)
legend("topleft",
       "Asia",
       fill = adjustcolor("#fd8d3c",0.7))# Asia, Europe and Other overlaid
# Asia, Europe and Other overlaid (rows 3, 4, 5)
cont_cols <- c(adjustcolor("#fd8d3c", 0.5),   # Asia
               adjustcolor("#003399", 0.5),   # Europe
               adjustcolor("#d9d9d9", 0.5))   # Other
radarchart(test_cont_forplot[c(1, 2, 3, 4, 5), ],
           pty = 32, axistype = 0,
           pcol = cont_cols, pfcol = cont_cols,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft",
       c("Asia","Europe","Other"),
       fill = cont_cols)# Overlay just asia and europe
# Overlay just Asia and Europe (rows 3 and 4)
ae_cols <- c(adjustcolor("#fd8d3c", 0.5),   # Asia
             adjustcolor("#003399", 0.5))   # Europe
radarchart(test_cont_forplot[c(1, 2, 3, 4), ],
           pty = 32, axistype = 0,
           pcol = ae_cols, pfcol = ae_cols,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft", c("Asia","Europe"), fill = ae_cols)# copy a new df
# Work on a copy of LBS_data for the self-perception questions.
perceptions_df <- LBS_data
# Row totals for the "how do you measure up" (...3_1) and
# "how do others perceive you" (...5_1) question sets.
self_cols  <- c("attr3_1", "sinc3_1", "intel3_1", "fun3_1", "amb3_1")
other_cols <- c("attr5_1", "sinc5_1", "intel5_1", "fun5_1", "amb5_1")
perceptions_df$total  <- rowSums(perceptions_df[, self_cols])
perceptions_df$total2 <- rowSums(perceptions_df[, other_cols])
# Redistribute and curve each set so a respondent's points sum to 100.
for (col in self_cols) {
  perceptions_df[[col]] <-
    round((perceptions_df[[col]] / perceptions_df$total) * 100, digits = 2)
}
for (col in other_cols) {
  perceptions_df[[col]] <-
    round((perceptions_df[[col]] / perceptions_df$total2) * 100, digits = 2)
}
# NOTE(review): the gender summaries further down are computed from
# LBS_clean rather than from this curved perceptions_df — confirm which
# frame the perception radar charts were meant to use.
# Create a dataframe to store data of how people see themselves
# (group_by(gender) sorts groups, so row 1 and row 2 refer to the same
# gender in test_selves and test_others)
test_selves <-LBS_clean %>%
group_by(gender) %>%
summarise(Attractive = mean(attr3_1),
Sincere = mean(sinc3_1),
Intelligent = mean(intel3_1),
Fun = mean(fun3_1),
Ambitious = mean(amb3_1))
# Create a dataframe to store data of how people think other's perceive them
test_others <- LBS_clean %>%
group_by(gender) %>%
summarise(Attractive = mean(attr5_1),
Sincere = mean(sinc5_1),
Intelligent = mean(intel5_1),
Fun = mean(fun5_1),
Ambitious = mean(amb5_1))
# modify for plotting
test_selves_forplot <-
test_selves %>%
select(-gender)
# modify for plotting
test_others_forplot <-
test_others %>%
select(-gender)
# new maxmin: the perception scores use a smaller axis ceiling (12) than
# the preference radar charts (36)
maxmin2 <- data.frame(
Attractive = c(12, 0),
Sincere = c(12, 0),
Intelligent = c(12, 0),
Fun = c(12, 0),
Ambitious = c(12, 0))
# create relevant plotting df
# NOTE(review): 'men' combines selves row 1 with others row 2 (and 'women'
# the reverse). Since both frames share the same gender row order, this
# looks like crossed indices — confirm the intended gender coding before
# trusting these two charts.
men <- rbind(maxmin2,
test_selves_forplot[1,],
test_others_forplot[2,])
women <- rbind(maxmin2,
test_selves_forplot[2,],
test_others_forplot[1,])# plot men
# Radar chart: how men see themselves vs how they think others see them.
men_cols <- c(adjustcolor("peachpuff4", 0.5),
              adjustcolor("#9ebcda", 0.5))
radarchart(men,
           pty = 32, axistype = 0,
           pcol = men_cols, pfcol = men_cols,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft",
       c("How men see themselves","How men think others see them"),
       pt.cex = 1,
       cex = 0.8,
       fill = men_cols)# plot women
# Radar chart: how women see themselves vs how they think others see them.
women_cols <- c(adjustcolor("#fa9fb5", 0.5),
                adjustcolor("#756bb1", 0.5))
radarchart(women,
           pty = 32, axistype = 0,
           pcol = women_cols, pfcol = women_cols,
           plty = 1, plwd = 3,
           cglty = 1, cglcol = "gray88",
           centerzero = TRUE, seg = 5,
           vlcex = 0.75, palcex = 0.75)
legend("topleft",
       c("How women see themselves","How women think others see them"),
       pt.cex = 1,
       cex = 0.8,
       fill = women_cols)# create a dataset with columns relevant to the model
# Build the modelling dataset: drop the columns that exist only in the LBS
# survey (orientation, share_code, from_cont have no speed-dating counterpart).
LBS_modeldf <- LBS_data %>%
  select(-orientation, -share_code, -from_cont)
# Restrict the speed-dating data to the shared columns plus the outcome.
model_df <- data %>%
  select(colnames(LBS_modeldf), match)
# Drop columns with more than 30% NA to limit listwise deletion in glm().
# FIX: use a logical mask instead of -which(...): if no column exceeded the
# threshold, -which(...) would be -integer(0) and silently select ZERO columns.
model_df <- model_df[, colMeans(is.na(model_df)) <= 0.3]
# Baseline logistic regression of `match` on every remaining predictor
# (excluding identifiers and free-text columns).
# FIX: call the family constructor directly rather than the string-call form
# 'binomial'(link=logit); same family object, idiomatic spelling.
model1 <- glm(match ~ . - iid - from - career,
              family = binomial(link = logit),
              data = model_df)
# summary
summary(model1)
Call:
glm(formula = match ~ . - iid - from - career, family = binomial(link = logit),
data = model_df)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9435 -0.6302 -0.5489 -0.4546 2.5696
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.021726 2.483191 -0.814 0.415551
age -0.032937 0.009759 -3.375 0.000738 ***
gender 0.026974 0.095052 0.284 0.776574
imprace -0.044908 0.012032 -3.732 0.000190 ***
goal -0.022969 0.023109 -0.994 0.320237
date -0.063495 0.023731 -2.676 0.007460 **
go_out -0.083062 0.032580 -2.549 0.010789 *
sports 0.021446 0.015993 1.341 0.179942
tvsports -0.014319 0.014261 -1.004 0.315365
exercise -0.013105 0.014634 -0.895 0.370530
dining 0.039751 0.022876 1.738 0.082265 .
museums -0.061957 0.033074 -1.873 0.061031 .
art 0.064546 0.028756 2.245 0.024795 *
hiking 0.011282 0.013762 0.820 0.412340
gaming 0.004676 0.013610 0.344 0.731193
clubbing 0.044979 0.013714 3.280 0.001038 **
reading 0.039912 0.017629 2.264 0.023577 *
tv 0.022632 0.016541 1.368 0.171252
theater 0.009440 0.019712 0.479 0.631991
movies -0.062631 0.023547 -2.660 0.007818 **
concerts 0.031119 0.022227 1.400 0.161499
music 0.005755 0.024693 0.233 0.815714
shopping -0.036704 0.016335 -2.247 0.024643 *
yoga 0.016118 0.012598 1.279 0.200778
attr1_1 -0.026705 0.017091 -1.563 0.118165
sinc1_1 -0.025610 0.017243 -1.485 0.137479
intel1_1 -0.014695 0.017413 -0.844 0.398709
fun1_1 -0.006486 0.017352 -0.374 0.708567
amb1_1 -0.015655 0.017162 -0.912 0.361663
shar1_1 -0.034379 0.017204 -1.998 0.045677 *
attr2_1 0.035407 0.018489 1.915 0.055489 .
sinc2_1 0.029999 0.019058 1.574 0.115456
intel2_1 0.041303 0.018994 2.174 0.029668 *
fun2_1 0.021884 0.018790 1.165 0.244152
amb2_1 0.034536 0.018528 1.864 0.062327 .
shar2_1 0.019740 0.018815 1.049 0.294109
attr3_1 0.055693 0.029531 1.886 0.059308 .
sinc3_1 0.011987 0.025824 0.464 0.642534
intel3_1 -0.026254 0.035040 -0.749 0.453701
fun3_1 0.010546 0.026991 0.391 0.696003
amb3_1 -0.037358 0.022239 -1.680 0.092989 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7305.7 on 8180 degrees of freedom
Residual deviance: 7124.2 on 8140 degrees of freedom
(197 observations deleted due to missingness)
AIC: 7206.2
Number of Fisher Scoring iterations: 4
There are several variables that are significant in predicting whether a match is found. Let’s choose the relevant ones and see how our model improves.
# select the relevant model variables (for the LBS dataset)
model2 <- glm(as.factor(match) ~ age +
imprace +
date +
go_out +
art +
clubbing +
reading +
movies +
shopping +
shar1_1 +
intel2_1,
family = 'binomial'(link=logit),
data=model_df)
# summarise
summary(model2)
Call:
glm(formula = as.factor(match) ~ age + imprace + date + go_out +
art + clubbing + reading + movies + shopping + shar1_1 +
intel2_1, family = binomial(link = logit), data = model_df)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9667 -0.6320 -0.5678 -0.4719 2.3452
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.295326 0.341740 -0.864 0.38749
age -0.034049 0.009125 -3.731 0.00019 ***
imprace -0.049357 0.011139 -4.431 9.38e-06 ***
date -0.072521 0.022114 -3.279 0.00104 **
go_out -0.111223 0.030689 -3.624 0.00029 ***
art 0.047256 0.014959 3.159 0.00158 **
clubbing 0.049680 0.012744 3.898 9.68e-05 ***
reading 0.033075 0.016202 2.041 0.04121 *
movies -0.044639 0.018730 -2.383 0.01716 *
shopping -0.016265 0.012707 -1.280 0.20054
shar1_1 -0.017223 0.004752 -3.625 0.00029 ***
intel2_1 0.007904 0.004766 1.658 0.09728 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7367.4 on 8216 degrees of freedom
Residual deviance: 7232.3 on 8205 degrees of freedom
(161 observations deleted due to missingness)
AIC: 7256.3
Number of Fisher Scoring iterations: 4
# remove shopping as it is no longer significant
model3 <- glm(as.factor(match) ~ age +
imprace +
date +
go_out +
art +
clubbing +
reading +
movies +
shar1_1 +
intel2_1,
family = 'binomial'(link=logit),
data=model_df)
# summarise
summary(model3)
Call:
glm(formula = as.factor(match) ~ age + imprace + date + go_out +
art + clubbing + reading + movies + shar1_1 + intel2_1, family = binomial(link = logit),
data = model_df)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9536 -0.6300 -0.5687 -0.4735 2.3230
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.376373 0.335770 -1.121 0.262319
age -0.032580 0.009041 -3.603 0.000314 ***
imprace -0.051503 0.011010 -4.678 2.9e-06 ***
date -0.070162 0.022017 -3.187 0.001439 **
go_out -0.111248 0.030659 -3.629 0.000285 ***
art 0.044390 0.014795 3.000 0.002696 **
clubbing 0.047300 0.012584 3.759 0.000171 ***
reading 0.033694 0.016178 2.083 0.037273 *
movies -0.049285 0.018342 -2.687 0.007210 **
shar1_1 -0.016898 0.004748 -3.559 0.000373 ***
intel2_1 0.008498 0.004742 1.792 0.073088 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7367.4 on 8216 degrees of freedom
Residual deviance: 7233.9 on 8206 degrees of freedom
(161 observations deleted due to missingness)
AIC: 7255.9
Number of Fisher Scoring iterations: 4
Model 3 has all significant variables. Use McFadden \(R^{2}\) index to assess model fit.
llh llhNull G2 McFadden r2ML
-3.616969e+03 -3.748399e+03 2.628610e+02 3.506310e-02 3.148364e-02
r2CU
5.261114e-02
As we can see, the model has a ‘\(R^{2}\)’ score of 0.035. This is a very low score and it can be attributed to two main factors. The main reason is that the most significant predictors of a match (or the variables with the highest correlation with a match) are values which are recorded during/after the date. As we do not have access to this dataset for LBS students, we have limited ourselves to the pre-date data available. Secondly, the different variables (categorical, ordinal etc.) used to predict a binary variable meant that the usual methods of linear regression were insufficient. In this case, even the logistic regression applied to the dataset may be inadequate and other methods of modelling could be further investigated to improve our results.
# Build a prediction dataframe restricted to model3's predictors.
LBS_predict <- LBS_clean %>%
  select(iid,
         age,
         imprace,
         date,
         go_out,
         art,
         clubbing,
         reading,
         movies,
         shar1_1,
         intel2_1)
# Clean the data: map the LBS survey's categorical answers onto the numeric
# codes used when fitting model3 on the speed-dating data.
# BUG FIX: the original re-assigned from LBS_clean here, silently discarding
# the column selection above (so the CSV carried every column); we now pipe
# from LBS_predict so the selection takes effect.
LBS_predict <- LBS_predict %>%
  mutate(
    age = case_when(age == '22 - 24' ~ 23,
                    age == '25 +' ~ 25,
                    age == '19 - 21' ~ 20
                    ),
    date = case_when(date=='Several times a year' ~ 6,
                     date=='Several times a week' ~ 1,
                     date=='Once a week' ~ 3,
                     date=='Twice a week' ~ 2,
                     date=='Almost never' ~7
                     ),
    go_out = case_when(go_out=='Several times a year' ~ 6,
                       go_out=='Several times a week' ~ 1,
                       go_out=='Once a week' ~ 3,
                       go_out=='Twice a week' ~ 2,
                       go_out=='Almost never' ~7
                       )
  )
# Predicted probability of a match for every respondent.
LBS_predict <- LBS_predict %>%
  mutate(prediction = predict(model3, newdata=LBS_predict, type='response'))
# save to csv and have Garance analyse (only she knows the names)
write.csv(LBS_predict, 'predicted chance of matching.csv')
# let's see the top 20 most matchworthy people
heart <- LBS_predict %>%
  arrange(desc(prediction)) %>%
  head(20) %>%
  select(iid, prediction)
heart| iid | prediction |
| 012 | 0.342 |
| 015 | 0.28 |
| 070 | 0.274 |
| 044 | 0.253 |
| 050 | 0.25 |
| 013 | 0.245 |
| 036 | 0.242 |
| 008 | 0.242 |
| 031 | 0.239 |
| 035 | 0.235 |
| 021 | 0.233 |
| 067 | 0.232 |
| 072 | 0.231 |
| 006 | 0.231 |
| 014 | 0.225 |
| 037 | 0.223 |
| 040 | 0.221 |
| 034 | 0.22 |
| 073 | 0.219 |
| 030 | 0.217 |
We can build a random forest classifier to determine the importance of each variable.
library(randomForest)
# Set the random seed to make this result reproducible
set.seed(50)
# Drop Na so nothing breaks (randomForest errors on missing values)
model_tree <- drop_na(model_df)
# Feed a randomForest model on the same predictor set as model3
fit <- randomForest(as.factor(match) ~ age +
imprace +
date +
go_out +
art +
clubbing +
reading +
movies +
shar1_1 +
intel2_1,
data = model_tree,
importance=TRUE,
ntree=1000
)
# Get the importance of the features
# We need to perform several operations on the fit$importance field, including:
# - take only the column we are interested in (column 1),
# - create a new column with the rowname on it,
# - rename the columns.
importance.features <- tibble::rownames_to_column(data.frame(fit$importance[,c(1)]))
colnames(importance.features) <- c("rowname", "value")
# Plot the importance of the features for people
# NOTE(review): the y axis shows an importance score, not a count — the
# ylab("Count") label is arguably misleading; confirm before changing.
# NOTE(review): the line below is a fused chunk boundary from the knitting
# process — `coord_flip()` ends this chunk and `model_treeM <- ...` starts
# the next; it is not valid as a single R statement.
ggplot(importance.features, aes(x = reorder(rowname, -value), y = value)) +
geom_bar(stat = "identity", position = "dodge", fill="mistyrose2", colour="black") +
xlab("Feature") + ylab("Count") + ggtitle("Importance of a Feature") +
coord_flip()model_treeM <- model_tree %>%
filter(gender == 1)
# Feed a randomForest model on the male subset (gender == 1)
fit <- randomForest(as.factor(match) ~ age +
imprace +
date +
go_out +
art +
clubbing +
reading +
movies +
shar1_1 +
intel2_1,
data = model_treeM,
importance=TRUE,
ntree=1000
)
# Get the importance of the features
# We need to perform several operations on the fit$importance field, including:
# - take only the column we are interested in (column 1),
# - create a new column with the rowname on it,
# - rename the columns.
importance.features <- tibble::rownames_to_column(data.frame(fit$importance[,c(1)]))
colnames(importance.features) <- c("rowname", "value")
# Plot the importance of the features for a man
# NOTE(review): the line below is a fused chunk boundary from the knitting
# process — `coord_flip()` ends this chunk and `model_treeF <- ...` starts
# the next; it is not valid as a single R statement.
ggplot(importance.features, aes(x = reorder(rowname, -value), y = value)) +
geom_bar(stat = "identity", position = "dodge", fill="cadetblue3", colour="black") +
xlab("Feature") + ylab("Count") + ggtitle("Importance of a Feature for Men") +
coord_flip()model_treeF <- model_tree %>%
filter(gender == 0)
# Feed a randomForest model on the female subset (gender == 0)
fit <- randomForest(as.factor(match) ~ age +
imprace +
date +
go_out +
art +
clubbing +
reading +
movies +
shar1_1 +
intel2_1,
data = model_treeF,
importance=TRUE,
ntree=1000
)
# Get the importance of the features
# We need to perform several operations on the fit$importance field, including:
# - take only the column we are interested in (column 1),
# - create a new column with the rowname on it,
# - rename the columns.
importance.features <- tibble::rownames_to_column(data.frame(fit$importance[,c(1)]))
colnames(importance.features) <- c("rowname", "value")
# Plot the importance of the features for a woman
ggplot(importance.features, aes(x = reorder(rowname, -value), y = value)) +
geom_bar(stat = "identity", position = "dodge", fill="hotpink1", colour="black") +
xlab("Feature") + ylab("Count") + ggtitle("Importance of a Feature for Women") +
coord_flip()